GOAL: To rank my genes according to differential expression,and perform thresholded over-representation analysis to highlight dominant themes in my top set of genes.
For the data I chosed from GEO is set GSE135339, this is a set of data which intercates a group of eRNAs that are actively involved in gene repression. The groups are treated with vehcile condition which enhancers in the absence, or presence (E2) of estradiol. First, I loaded the data from assignment1, then created a matrix preparing for HeapMap.
normalized_count_data <- readRDS(file=file.path(getwd(),"normalized_counts.rds"))
#to omit NA for generating heatmap
normlized_count_nona <- na.omit(normalized_count_data)
# Grouping the data
samples <- data.frame(lapply(colnames(normlized_count_nona)[3:14],
FUN=function(x){
unlist(strsplit(x, split = "\\_"))[c(1,2)]}))
colnames(samples) <- colnames(normlized_count_nona)[3:14]
rownames(samples) <- c("sample_type", "treatment_type")
samples <- data.frame(t(samples))
# Create heatmap from previous data
heatmap_matrix <- normlized_count_nona[, 3:ncol(normlized_count_nona)]
rownames(heatmap_matrix) <- normlized_count_nona$ensembl_gene_id
colnames(heatmap_matrix) <- colnames(normlized_count_nona[, 3:ncol(normlized_count_nona)])
heatmap_col = circlize::colorRamp2(c(min(heatmap_matrix), 0, max(heatmap_matrix)),
c("blue", "white", "red"))
overall_heatmap <- ComplexHeatmap::Heatmap(as.matrix(heatmap_matrix), show_row_dend = TRUE,
show_column_dend = TRUE, col = heatmap_col, show_column_names = TRUE, show_row_names = FALSE,
show_heatmap_legend = TRUE)
Pregenerated heatmap for all genes
From the Heapmap, we can see that there are some gene expression variations between different treatments and sample types. Then differential gene expression analysis is necessary.
# Label the MDS plots by patient type
limma::plotMDS(heatmap_matrix, labels = samples$sample_type, col = unlist(rainbow(20))[factor(samples$sample_type)],
main = "MDS plot by patient type")
# Label the MDS plots by treatment type
limma::plotMDS(heatmap_matrix, labels = samples$treatment_type, col = unlist(rainbow(20))[factor(samples$treatment_type)],
main = "MDS plot by Treatment type")
The two MDS plots shows variations between sample types and treatment types. we can see that the clustering between different treatmeent is closer than different patient type. Then, the treatment should effect on every type of patient from visualizing.
# Design differential expression model according to sample type for limma
model_design <- model.matrix(~samples$sample_type)
expressionMatrix <- as.matrix(normlized_count_nona[, 3:14])
rownames(expressionMatrix) <- normlized_count_nona$ensembl_gene_id
colnames(expressionMatrix) <- colnames(normlized_count_nona)[3:14]
minimalSet <- Biobase::ExpressionSet(assayData = expressionMatrix)
fit <- limma::lmFit(minimalSet, model_design)
fit2 <- limma::eBayes(fit, trend = TRUE)
topfit <- limma::topTable(fit2, coef = ncol(model_design), adjust.method = "BH",
number = nrow(expressionMatrix))
output_hits_pat_s <- merge(normlized_count_nona[, 1:2], topfit, by.y = 0, by.x = 2,
all.y = TRUE)
# Sort by P-value
output_hits_pat_s <- output_hits_pat_s[order(output_hits_pat_s$P.Value), ]
# Visualization
knitr::kable(output_hits_pat_s[1:10, 2:8], type = "html", row.names = FALSE, caption = "Fitted with sample type by limma")
| hgnc_symbol | logFC | AveExpr | t | P.Value | adj.P.Val | B |
|---|---|---|---|---|---|---|
| S100A7 | 204.310760 | 71.150153 | 17.13198 | 0e+00 | 1.85e-05 | 2.220239 |
| UBE2QL1 | -13.895369 | 9.518490 | -14.87151 | 0e+00 | 3.31e-05 | 2.047229 |
| DMKN | 155.824069 | 97.378712 | 14.61166 | 0e+00 | 3.31e-05 | 2.022781 |
| UPK2 | -85.062575 | 134.924994 | -14.31913 | 0e+00 | 3.31e-05 | 1.993886 |
| PI3 | 5.020554 | 2.103346 | 13.90549 | 0e+00 | 3.31e-05 | 1.950360 |
| KRT16 | 144.606629 | 57.086136 | 13.84989 | 0e+00 | 3.31e-05 | 1.944257 |
| BAMBI | -68.057866 | 48.384714 | -13.31635 | 0e+00 | 4.37e-05 | 1.882402 |
| ANXA6 | -74.792890 | 103.380743 | -12.85988 | 0e+00 | 5.61e-05 | 1.824348 |
| CLN3 | -103.780814 | 241.843920 | -12.37646 | 0e+00 | 6.70e-05 | 1.757059 |
| AGR2 | 365.146227 | 445.725688 | 12.20129 | 1e-07 | 6.70e-05 | 1.731063 |
# Number of gene pass the threshold p-value < 0.001
length(which(output_hits_pat_s$P.Value < 0.001))
## [1] 472
# Number of gene pass genes pass correction
length(which(output_hits_pat_s$adj.P.Val < 0.05))
## [1] 675
# Design differential expression model according to cell type + patient
model_design_pat <- model.matrix(~samples$sample_type + samples$treatment_type)
fit_pat <- limma::lmFit(minimalSet, model_design_pat)
fit2_pat <- limma::eBayes(fit_pat, trend = TRUE)
topfit_pat <- limma::topTable(fit2_pat, coef = ncol(model_design_pat), adjust.method = "BH",
number = nrow(expressionMatrix))
output_hits_pat <- merge(normalized_count_data[, 1:2], topfit_pat, by.y = 0, by.x = 2,
all.y = TRUE)
output_hits_pat <- output_hits_pat[order(output_hits_pat$P.Value), ]
# Showcase data
knitr::kable(output_hits_pat[1:10, 2:8], type = "html", row.names = FALSE, caption = "Fitted with sample &treatment type by limma")
| hgnc_symbol | logFC | AveExpr | t | P.Value | adj.P.Val | B |
|---|---|---|---|---|---|---|
| IL1R1 | 6.245494 | 8.996324 | 13.64091 | 0e+00 | 0.0001329 | 5.211485 |
| AHRR | 29.202730 | 53.759727 | 12.89265 | 0e+00 | 0.0001329 | 5.014879 |
| RASSF2 | 3.361061 | 4.310365 | 12.74555 | 0e+00 | 0.0001329 | 4.973246 |
| CTDSP2 | 39.603079 | 124.969650 | 12.28230 | 0e+00 | 0.0001504 | 4.835116 |
| BMF | 36.847691 | 42.767698 | 11.38331 | 1e-07 | 0.0002494 | 4.533261 |
| GBE1 | 3.371525 | 4.045235 | 11.30766 | 1e-07 | 0.0002494 | 4.505610 |
| ELF3 | 28.996950 | 114.649312 | 11.11179 | 1e-07 | 0.0002588 | 4.432268 |
| ZNF28 | 4.602828 | 7.264139 | 10.89088 | 2e-07 | 0.0002797 | 4.346396 |
| AMOTL2 | 40.086365 | 117.530370 | 10.78154 | 2e-07 | 0.0002797 | 4.302611 |
| HIVEP1 | 7.452622 | 14.970709 | 10.04876 | 4e-07 | 0.0003597 | 3.985545 |
# Number of gene pass the threshold p-value < 0.001
length(which(output_hits_pat$P.Value < 0.001))
## [1] 3403
# Number of gene pass genes pass correction
length(which(output_hits_pat$adj.P.Val < 0.05))
## [1] 7295
# Setup the model for EdgeR\
model_design_pat <- model.matrix(~samples$sample_type)
my_counts_matrix <- normlized_count_nona[, 3:ncol(normlized_count_nona)]
rownames(my_counts_matrix) = make.names(normlized_count_nona$hgnc_symbol, unique = TRUE)
d <- edgeR::DGEList(counts = my_counts_matrix, group = samples$cell_type)
d <- edgeR::estimateDisp(d, model_design_pat)
# fit the data
fit <- edgeR::glmQLFit(d, model_design_pat)
qlf.fit <- edgeR::glmQLFTest(fit, coef = "samples$sample_typeVector")
knitr::kable(topTags(qlf.fit), type = "html", row.names = FALSE, caption = "By sample type with edgeR")
|
|
|
|
qlf_output_hits <- edgeR::topTags(qlf.fit, sort.by = "PValue", n = nrow(normlized_count_nona))
# Number of gene pass the threshold p-value < 0.001
length(which(qlf_output_hits$table$PValue < 0.001))
## [1] 596
# Number of gene pass genes pass correction
length(which(qlf_output_hits$table$FDR < 0.05))
## [1] 834
I have calculated p-value for each gene using Limma. I choose to use 0.1% as my threhold for this data, because I checked 5% and 1%, both of them have large results, then since thet are all significant, i want to know the statistically highly significant at 0.1%.
I used empirical Bayes moderation to compute with model build in limma and use quasi-likelihood generalized log-linear model for multiple hypothesis correction.
For using just patient in the modle by limma, it labels 5116 genes with significant adjusted p values from the analysis, and for the complex model with use patient and treatment by limma labels 366genes. Also, by edgeR using patient, generated a simple model with produce 1348 genes with significant adjusted p values.
filtered_data_matrix <- load("filtered_data_matrix.RData")
simple_model_pvalues <- data.frame(ensembl_id = output_hits_pat_s$ensembl_gene_id,
simple_pvalue = output_hits_pat_s$P.Value)
pat_model_pvalues <- data.frame(ensembl_id = output_hits_pat$ensembl_gene_id, patient_pvalue = output_hits_pat$P.Value)
two_models_pvalues <- merge(simple_model_pvalues, pat_model_pvalues, by.x = 1, by.y = 1)
# Plot
two_models_pvalues$colour <- "black"
two_models_pvalues$colour[two_models_pvalues$simple_pvalue < 0.05] <- "orange"
two_models_pvalues$colour[two_models_pvalues$patient_pvalue < 0.05] <- "blue"
two_models_pvalues$colour[two_models_pvalues$simple_pvalue < 0.05 & two_models_pvalues$patient_pvalue <
0.05] <- "red"
plot(two_models_pvalues$simple_pvalue, two_models_pvalues$patient_pvalue, col = two_models_pvalues$colour,
xlab = "simple model p-values", ylab = "Patient model p-values", main = "Simple vs Patient Limma")
legend("topleft", inset = 0.01, legend = c("Simple", "Complex", "Both", "Not Signif"),
fill = c("orange", "blue", "red", "black"))
# highlight the gene used in the study
emsembl_of_interest <- normlized_count_nona$ensembl_gene_id[which(normlized_count_nona$hgnc_symbol ==
"EFEMP1" | normlized_count_nona$hgnc_symbol == "TM4SF1")]
two_models_pvalues$colour <- "grey"
two_models_pvalues$colour[two_models_pvalues$ensembl_id == emsembl_of_interest] <- "red"
plot(two_models_pvalues$simple_pvalue, two_models_pvalues$patient_pvalue, col = two_models_pvalues$colour,
xlab = "simple model p-values", ylab = "Patient model p-values", main = "Simple vs Patient Limma")
points(two_models_pvalues[which(two_models_pvalues$ensembl_id == emsembl_of_interest),
2:3], pch = 20, col = "red", cex = 1.5)
legend("topleft", inset = 0.01, legend = c("EFEMP1|TM4SF1", "rest"), fill = c("red",
"grey"))
Using the p-values we generated from the precious section, the comaprion model is been generated to visualize the relationship between two. Where there is only partial data which is closer to 0 that is in both dataset. Additionally, eRNAs of TM4SF1 and EFEMP1, these are the highlighted genes stated in the research, are labeled in the second plot.
# Compare Limma with Quasi(EdgeR)
qlf_pat_model_pvalues <- data.frame(hgnc_id = rownames(qlf_output_hits$table), qlf_patient_pvalue = qlf_output_hits$table$PValue)
limma_pat_model_pvalues <- data.frame(hgnc_id = output_hits_pat$hgnc_symbol, limma_patient_pvalue = output_hits_pat$P.Value)
two_models_pvalues <- merge(qlf_pat_model_pvalues, limma_pat_model_pvalues, by.x = 1,
by.y = 1)
two_models_pvalues$colour <- "black"
two_models_pvalues$colour[two_models_pvalues$qlf_patient_pvalue < 0.05] <- "orange"
two_models_pvalues$colour[two_models_pvalues$limma_patient_pvalue < 0.05] <- "blue"
two_models_pvalues$colour[two_models_pvalues$qlf_patient_pvalue < 0.05 & two_models_pvalues$limma_patient_pvalue <
0.05] <- "red"
plot(two_models_pvalues$qlf_patient_pvalue, two_models_pvalues$limma_patient_pvalue,
col = two_models_pvalues$colour, xlab = "QLF patient model p-values", ylab = "Limma Patient model p-values",
main = "QLF vs Limma (sample type)")
legend("topleft", inset = 0.01, legend = c("Simple", "Complex", "Both", "Not Signif"),
fill = c("orange", "blue", "red", "black"))
#Highlight the gene in study
hgnc_of_interest <- normlized_count_nona$hgnc_symbol[
which(normlized_count_nona$hgnc_symbol == "EFEMP1"|
normlized_count_nona$hgnc_symbol == "TM4SF1" )]
two_models_pvalues$colour <- "grey"
two_models_pvalues$colour[two_models_pvalues$hgnc_id==hgnc_of_interest] <- "red"
plot(two_models_pvalues$qlf_patient_pvalue,
two_models_pvalues$limma_patient_pvalue,
col = two_models_pvalues$colour,
xlab = "QLF patient model p-values",
ylab ="Limma Patient model p-values",
main="QLF vs Limma")
points(two_models_pvalues[
two_models_pvalues$hgnc_id==hgnc_of_interest,2:3],
pch=24, col="red", cex=1.5)
legend("topleft",inset=.01, legend=c("EFEMP1|TM4SF1", "rest"),
fill = c("red", "grey"))
For the model generated by QLF and Limma has more similarist to the complex model from the previous section. From the scatter point, the gene were similar to the model genertated from limma.
# plots from samples_type edgeR
E2 <- normlized_count_nona[,grep("E2", colnames(normlized_count_nona))]
Veh <-normlized_count_nona[,grep("Veh", colnames(normlized_count_nona))]
data_avg <- data.frame(hgnc_symbol = normlized_count_nona$hgnc_symbol,
E2 = rowMeans(E2), Veh = rowMeans(Veh))
qlf_output <- cbind(qlf_output_hits$table,
hgnc_symbol = rownames(qlf_output_hits$table))
normlized_qlf <- merge(data_avg, qlf_output, by.x= "hgnc_symbol",
by.y="hgnc_symbol", all=TRUE)
under_exp <- sum(normlized_qlf$logFC < 0 & normlized_qlf$PValue < 0.001)
over_exp <- sum(normlized_qlf$logFC > 0 & normlized_qlf$PValue < 0.001)
status <- rep(0, nrow(normlized_qlf))
status[normlized_qlf$logFC < 0 & normlized_qlf$PValue < 0.001] <- -1
status[normlized_qlf$logFC > 0 & normlized_qlf$PValue < 0.001] <- 1
plotMD(log2(normlized_qlf[,c(2,3)]), status=status, values=c(-1,1),
hl.col=c("blue","red"), main = "MA plots (sign gene with edgeR)")
# Plots from samples_type + patient model(Limma)
result.fit <- decideTests(fit2_pat)
par(mfrow=c(1,2))
plotMD(fit2_pat, status=result.fit[,"samples$sample_typeWT"], values = c(-1, 1), hl.col=c("blue","red"), main = "MA plots for Vector vs WT ")
plotMD(fit2_pat, status=result.fit[,"samples$treatment_typeVeh"], values = c(-1, 1), hl.col=c("blue","red"), main = "MA plots for Vector vs Veh ")
#### Q3:how the amount of differentially expressed genes using an MA Plot or a Volcano plot. Highlight genes of interest. I have plotted the MA plot at above using edgeR, NA downregulated gene were labeled in blue, and NA upregulated gene are labeled in red. In addtion, I also plotted Vector vs WT and Vector vs Veh, to have a better visualization about which is changing more.
# Subset the threshold gene
top_hits <- qlf_output$hgnc_symbol[qlf_output$PValue < 0.001]
# Create and order the heatmap matrix
heatmap_matrix_tophits <- t(
scale(t(heatmap_matrix[which(rownames(heatmap_matrix)
%in% normlized_count_nona$ensembl_gene_id),])))
heatmap_matrix_tophits<- heatmap_matrix_tophits[,
c(grep("E2",colnames(heatmap_matrix_tophits)),
grep("Veh",colnames(heatmap_matrix_tophits)))]
if(min(heatmap_matrix_tophits) == 0){
heatmap_col = circlize::colorRamp2(c( 0, max(heatmap_matrix_tophits)),
c( "white", "red"))
} else {
heatmap_col = circlize::colorRamp2(c(min(heatmap_matrix_tophits), 0,
max(heatmap_matrix_tophits)),
c("blue", "white", "red")) }
ComplexHeatmap::Heatmap(as.matrix(heatmap_matrix_tophits),
cluster_rows = TRUE,
cluster_columns = FALSE,
show_row_dend = TRUE,
show_column_dend = FALSE,
col=heatmap_col,
show_column_names = TRUE,
show_row_names = FALSE,
show_heatmap_legend = TRUE,
)
Part of my condition cluster together,for E2 condition in dDBD it showed a difference which DBD-truncated ERα was not able to bind to E2-repressed enhancers, thus it was expected that in E2 conditon, dDBD will have different result as other.
upregulated_genes <- qlf_output$hgnc_symbol[which(qlf_output$PValue < 0.001 &
qlf_output$logFC >0)]
downregulated_genes <-qlf_output$hgnc_symbol[which(qlf_output$PValue < 0.001 &
qlf_output$logFC < 0)]
write.table(x=upregulated_genes,
file=file.path("ER_upregulated_genes.txt"),sep = "\t",
row.names = FALSE,col.names = FALSE,quote = FALSE)
write.table(x=downregulated_genes,
file=file.path("ER_downregulated_genes.txt"),sep = "\t",
row.names = FALSE,col.names = FALSE,quote = FALSE)
I used G:Profiler[@gProfiler] as the tool and used threhold Benjamini-Hochberg FDR. I choose g:profiler because it is continously updated with new data from Ensembl and WormBase ParaSite, and have multiple human genome databases to operate for. Then, I will know very soon if their are new data might fit to data set.
I used 4 annotation data:GO: Biological Process(Release 2020-12-08 [@GO]),KEGG(Release 2020-12-14 [@KEGG]),Reactome(Release 2020-12-15 [@Reactome]() and WikiPathways(Release 2020-12-10 [@WP]). All of these are biological pathways, it can fullfill the biological process. Since, for the dataset, it is considering the pathways associated with ER alpha. #### Q3: How many genesets were returned with what thresholds? I used 1% threshodls, and it returned 1371 genesets under GO, 114 under KEGG, 473 under REAC and 98 under WP.